# Locations: project root and its data/final/ sub-folder (both end in "/").
wd <- "C:/Users/MUNTEANU_A/Dropbox/Research/2018 JMP/"
wd_data_final <- paste0(wd, "data/final/")

pacman::p_load(tidyr)



####################################READ
# NOTE(review): select(), mutate() and bind_rows() are dplyr verbs, but this
# file only loads tidyr -- presumably dplyr is attached by an earlier script in
# the session; confirm, or add it to the p_load() call.
setwd(wd_data_final)

# Read one RDS file from the working directory and drop the columns that the
# merged student data sets must not carry forward.
#
# file: name of the RDS file to read.
# Returns the stored data frame minus
#   * every column whose name starts with one of `dropped_prefixes`, and
#   * every column whose name matches one of `dropped_patterns`.
read_without_identifiers <- function(file) {
  dropped_prefixes <- c("school_name_", "lat", "lon", "lng", "address", "nume",
                        "adm_diff", "adm_exam", "adm_four")
  dropped_patterns <- c("orig", "corres", "unitate_de_invatamant_adm")

  readRDS(file) %>%
    select(-starts_with(dropped_prefixes)) %>%
    select(-matches(dropped_patterns))
}

# Student-level data sets all go through the same column filter.
data_student <- read_without_identifiers("data_student")
data_student_teacher <- read_without_identifiers("data_student_teacher")
data_student_teacher_expenditure <-
  read_without_identifiers("data_student_teacher_expenditure")
data_student_expenditure <- read_without_identifiers("data_student_expenditure")

# Teacher data: remove the listed name/school/hometown columns.
data_teacher <- readRDS("data_teacher") %>%
  select(-Teacher, -Teacher.inspection, -Teacher.written, -Teacher.teacher,
         -School, -Hometown)

# Expenditure data is stored as a list of tables; the first is tagged "Direct"
# and the second "Contract" before the two are stacked into one data frame.
data_expenditure <- readRDS("data_expenditure")
data_expenditure[[1]]$Type <- "Direct"
# The second column of the first table is renamed positionally
# (presumably to line up with the second table's column name -- confirm).
colnames(data_expenditure[[1]])[2] <- "Castigator"
data_expenditure[[2]]$Type <- "Contract"
data_expenditure[[2]] <- data_expenditure[[2]] %>%
  mutate(CPVCodeID = as.factor(CPVCodeID))
data_expenditure <- bind_rows(data_expenditure[[1]], data_expenditure[[2]])

openings <- read.csv("openings.csv")


####################################Prepare codes
# Collect the distinct values of one column across every loaded data set.
#
# name: column name, given as a single string.
# Returns a character vector of the unique values found, in first-seen order.
# A data set that has no column called `name` contributes nothing.
#
# Every data set that is later passed to anonymize() must be consulted here;
# otherwise names that occur only in that data set are absent from the lookup
# and are masked as NA. data_expenditure was previously left out (and
# data_student_expenditure listed twice instead).
get_list <- function(name) {
  result <- unique(c(
    as.character(data_student[[name]]),
    as.character(data_teacher[[name]]),
    as.character(data_student_teacher[[name]]),
    as.character(data_student_expenditure[[name]]),
    as.character(data_student_teacher_expenditure[[name]]),
    as.character(data_expenditure[[name]]),
    as.character(openings[[name]])
  ))

  result
}
# School names: one vocabulary per school-name column, then merged into a
# single lookup. anonymize() uses the position within list_schools as the code.
list_hs_adm             <- get_list("liceu_repartizat")
list_hs_grad_harmonized <- get_list("school_harmonized")
list_ms                 <- get_list("scoala_de_provenienta")
list_hs_grad            <- get_list("unitate_de_invatamant")
list_schools <- unique(c(
  list_hs_adm,
  list_hs_grad_harmonized,
  list_ms,
  list_hs_grad
))
# Alternative kept for reference: an explicit school/code table.
# list_schools<-data.frame(school=unique(c(list_hs_adm,list_hs_grad_harmonized,list_ms,list_hs_grad)))
# list_schools<-list_schools %>% mutate(code=row_number())

# Town names: same approach, merged into list_towns.
list_town_adm        <- get_list("town_hs_adm")
list_town_bac        <- get_list("town_hs_bac")
list_town_bac_county <- get_list("town")
list_town_teacher    <- get_list("Town")
list_towns <- unique(c(
  list_town_adm,
  list_town_bac,
  list_town_bac_county,
  list_town_teacher
))
# Alternative kept for reference: an explicit town/code table.
# list_towns<-data.frame(town=unique(c(list_town_adm,list_town_bac,list_town_bac_county,list_town_teacher)))
# list_towns<-list_towns %>% mutate(code=row_number())

# Town names used to build the two opening flags below.
list_hs_opening_1 <- c("ALESD", "TOPOLOVENI")
list_hs_opening_2 <- c("TARGU NEAMT", "TARGU LAPUS")

# Add the opening flags to the student data before anonymizing: afterwards
# town_hs_bac holds codes instead of names and can no longer be compared with
# the name lists above.
# `%in%` already yields TRUE/FALSE and never NA, so no ifelse() wrapper is
# needed (and the reassignable shorthands T/F are avoided).
# NOTE(review): the later subsetting step drops student columns matching
# "opening", so these flags may not reach the saved file -- confirm intent.
data_student <- data_student %>%
  ungroup() %>%
  mutate(
    opening_town_1 = town_hs_bac %in% list_hs_opening_1,
    opening_town_2 = town_hs_bac %in% list_hs_opening_2
  )

####################################Anonymize
# Replace school and town names in a data frame with numeric codes.
#
# x: data frame whose school/town columns should be masked.
# Returns `x` with the same columns, where
#   * every column whose name matches one of the school-name patterns holds the
#     position of its value in the global `list_schools`, and
#   * every column whose name starts with "town" (case-insensitive) holds the
#     position of its value in the global `list_towns`.
# Empty strings stay empty, "HS NOT MATCHED" is kept verbatim in school
# columns, and values missing from the lookup become NA.
#
# Columns are picked by matching on colnames(x) directly (case-insensitive,
# like the tidyselect helpers' default), so grouping columns of a grouped data
# frame are not swept into the masking loops. Codes are held in a local
# variable: no helper column `code` is added to the result, and an existing
# `code` column in `x` is left untouched.
#
# NOTE(review): ifelse() builds its result from the branches actually used, so
# a masked column is character when it contains ""/"HS NOT MATCHED" entries and
# integer otherwise -- keep in mind when joining on these columns.
anonymize <- function(x) {
  col_names <- colnames(x)

  # anonymize schools
  school_cols <- grep(
    "unitate_de_invatamant|school_harmonized|liceu_repartizat|scoala_de_provenienta",
    col_names,
    ignore.case = TRUE,
    value = TRUE
  )
  for (colname in school_cols) {
    print(colname)
    values <- x[[colname]]
    codes <- match(values, list_schools)
    x[[colname]] <- ifelse(
      values == "", "",
      ifelse(values == "HS NOT MATCHED", "HS NOT MATCHED", codes)
    )
  }

  # anonymize towns
  town_cols <- grep("^town", col_names, ignore.case = TRUE, value = TRUE)
  for (colname in town_cols) {
    print(colname)
    values <- x[[colname]]
    codes <- match(values, list_towns)
    x[[colname]] <- ifelse(values == "", "", codes)
  }

  x
}
# Mask school and town names in every data set (order kept, since anonymize()
# prints the name of each column it processes).
data_student_anon                     <- anonymize(data_student)
data_teacher_anon                     <- anonymize(data_teacher)
data_expenditure_anon                 <- anonymize(data_expenditure)
data_student_teacher_anon             <- anonymize(data_student_teacher)
data_student_teacher_expenditure_anon <- anonymize(data_student_teacher_expenditure)
data_student_expenditure_anon         <- anonymize(data_student_expenditure)
openings_anon                         <- anonymize(openings)



# setwd(wd_data_final)
# saveRDS(data_student_anon,"data_student_anon",compress=FALSE)
# saveRDS(data_teacher_anon,"data_teacher_anon",compress=FALSE)
# saveRDS(data_expenditure_anon,"data_expenditure_anon",compress=FALSE)
# saveRDS(data_student_teacher_anon,"data_student_teacher_anon",compress=FALSE)
# saveRDS(data_student_expenditure_anon,"data_student_expenditure_anon",compress=FALSE)
# saveRDS(data_student_teacher_expenditure_anon,"data_student_teacher_expenditure_anon",compress=FALSE)
# saveRDS(openings_anon,"openings_anon",compress=FALSE)


#subset variables
# Each anonymized data set is reduced to the columns whose names match a keep
# pattern; the student-based sets additionally lose helper columns.
# NOTE(review): the *_anon objects are re-read from disk here, replacing the
# ones built by anonymize() earlier in this script, while the saveRDS() calls
# that would refresh those files are commented out -- confirm the files on disk
# are current.
# NOTE(review): select() runs before ungroup(); on a grouped input select()
# keeps the grouping columns even when they do not match -- confirm intended.
setwd(wd_data_final)

# Keep-pattern shared by all student-based data sets (pieces joined with "|").
keep_core <- paste(
  "SIRUTA|SIIIR|judet|^an|liceu|school_h|scoala_de|town|specializare|id_",
  "unitate|n_|dist|rezultat|school_change|entrance_|grad_|class_|school_",
  "dec|quart|med|Wages_hs|drop|Unemployment_hs",
  sep = "|"
)
# Teacher-related columns on top of the core set.
keep_teacher <- paste0(keep_core, "|teacher|County|Year")
# Helper columns removed from the student-based data sets.
drop_helpers <- "cls|scl|_ID_|opening"

data_student_anon <- readRDS("data_student_anon") %>%
  select(matches(keep_core, perl = TRUE)) %>%
  select(-matches(drop_helpers, perl = TRUE)) %>%
  ungroup()

data_teacher_anon <- readRDS("data_teacher_anon") %>%
  select(matches(keep_teacher, perl = TRUE)) %>%
  select(-matches("_Level|Certification|Hometown|Long|County\\.|County_|ID|Inspection",
                  perl = TRUE)) %>%
  ungroup()

data_expenditure_anon <- readRDS("data_expenditure_anon") %>%
  select(matches("town|judet|ValoareEUR|^an|Type|unitate", perl = TRUE)) %>%
  ungroup()

data_student_teacher_anon <- readRDS("data_student_teacher_anon") %>%
  select(matches(keep_teacher, perl = TRUE)) %>%
  select(-matches(drop_helpers, perl = TRUE)) %>%
  ungroup()

data_student_expenditure_anon <- readRDS("data_student_expenditure_anon") %>%
  select(matches(paste0(keep_core, "|Exp"), perl = TRUE)) %>%
  select(-matches(drop_helpers, perl = TRUE)) %>%
  ungroup()

data_student_teacher_expenditure_anon <-
  readRDS("data_student_teacher_expenditure_anon") %>%
  select(matches(
    paste0(
      keep_teacher,
      "|Exp|ValoareEUR|Type|subject|mandatory|elective|disciplina|lb_romana",
      "|gpa|Exeprience|Category"
    ),
    perl = TRUE
  )) %>%
  select(-matches(drop_helpers, perl = TRUE)) %>%
  ungroup()

# Write the anonymized data sets to the final-data folder, one compressed RDS
# file per object, each named after the object it holds.
# The student-teacher-expenditure set is written to its own "_anon" file (the
# file the subsetting step reads); it was previously written to
# "data_student_teacher_expenditure", which overwrote the raw input file.
setwd(wd_data_final)
saveRDS(data_student_anon, "data_student_anon", compress = TRUE)
saveRDS(data_teacher_anon, "data_teacher_anon", compress = TRUE)
saveRDS(data_expenditure_anon, "data_expenditure_anon", compress = TRUE)
saveRDS(data_student_teacher_anon, "data_student_teacher_anon", compress = TRUE)
saveRDS(data_student_expenditure_anon, "data_student_expenditure_anon",
        compress = TRUE)
saveRDS(data_student_teacher_expenditure_anon,
        "data_student_teacher_expenditure_anon", compress = TRUE)
saveRDS(openings_anon, "openings_anon", compress = TRUE)

  
